# Render a small jQuery snippet that hides all notebook code cells by default
# and adds a "click here" link to toggle them — purely cosmetic for readers
# of the exported notebook.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
import pandas as pd

# Load the three HappyDB CSVs. read_csv already returns a DataFrame, so the
# original's pd.DataFrame(pd.read_csv(...)) wrapper only made a useless copy.
df = pd.read_csv('/Users/matthewvitha/Downloads/cleaned_hm.csv')          # one row per happy moment
df.head(2)
df_sense = pd.read_csv('/Users/matthewvitha/Downloads/senselabel (2).csv')  # token-level sense labels
df_sense.head(2)
df_demo = pd.read_csv('/Users/matthewvitha/Downloads/demographic (1).csv')  # worker demographics
df_demo.head(2)
print(df_demo.shape)
# Keep only fully-populated demographic rows for downstream profiling.
df_demo_nonan = df_demo.dropna(how='any')
df_demo_nonan.shape
# Age distribution of workers; read_csv already returns a DataFrame, so the
# original's extra pd.DataFrame(...) wrapper was a needless copy.
df_demo_age = pd.read_csv('/Users/matthewvitha/Downloads/demo_age.csv')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_demo_age = df_demo_age.dropna()
# NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
# switch to sns.histplot/displot if the environment is upgraded.
sns.distplot(df_demo_age['age'])
plt.title('Age Distribution')
df_demo_nonan.head(2)
# One-shot HTML profile of the cleaned demographics table.
import pandas_profiling
pandas_profiling.ProfileReport(df_demo_nonan)
# Character length of each cleaned happy-moment text.
df['hm_length'] = df['cleaned_hm'].str.len()
sns.distplot(df['hm_length'])
plt.title('Length of Happy Moment - Distribution')
sns.set(style="whitegrid")
# Box plot of text length split by reflection period.
# NOTE(review): hue without a categorical y-axis relies on seaborn accepting
# hue-only grouping — confirm against the installed seaborn version.
ax = sns.boxplot(x="hm_length", hue="reflection_period",data=df, palette="Set3")
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="ticks", color_codes=True)
# Frequency of each predicted happiness category.
f, ax = plt.subplots(figsize=(10, 5))
ax = sns.countplot(x="predicted_category", data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_axisbelow(True)
plt.tight_layout()
plt.title("Frequency of Predicted Category Labels")
plt.grid(True, color='k', linestyle='-', linewidth=2)
plt.show()
# Text-length box plots per predicted category, split by reflection period.
# BUG FIX: the original's bare `figsize=(50,50)` assignment was dead code —
# it bound a tuple to a local name and never reached any figure. Removed;
# size a catplot with its height/aspect parameters if needed.
sns.catplot(x="hm_length", y="predicted_category", kind='box',
            hue='reflection_period', palette="Blues", data=df)
plt.title('Length of Happy Moment by Predicted Category')
# Frequency of each part-of-speech tag in the sense-labelled tokens.
ax = sns.countplot(x="POS", data=df_sense)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.grid(True, color='k', linestyle='-', linewidth=2)
plt.title('Frequency of Different POS')
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
f, ax = plt.subplots(figsize=(10, 5))
# Frequency of each supersense label across all labelled tokens.
ax = sns.countplot(x="supersenseLabel", data=df_sense)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.title("Frequency of Different Super_sense Labels")
plt.grid(True, color='k', linestyle='-', linewidth=2)
plt.show()
import pandas as pd
# Reload the raw happy-moments data (read_csv already returns a DataFrame).
df = pd.read_csv('/Users/matthewvitha/Downloads/cleaned_hm.csv')
import string
# Lower-case every happy moment.
df['cleaned_hm'] = [i.lower() for i in df['cleaned_hm']]
# BUG FIX: `s.translate(string.punctuation)` does NOT strip punctuation in
# Python 3 — translate needs an ordinal->replacement table, and indexing the
# raw punctuation string by codepoint remapped control characters instead
# (a silent no-op for normal text). Build a deletion table once.
_punct_table = str.maketrans('', '', string.punctuation)
df['cleaned_hm'] = [i.translate(_punct_table) for i in df['cleaned_hm']]
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
def preprocess(sentence):
    """Lower-case *sentence*, tokenize on word characters, drop English stopwords.

    Parameters
    ----------
    sentence : str
        Raw happy-moment text.

    Returns
    -------
    str
        Space-joined filtered tokens.
    """
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    # PERF FIX: stopwords.words('english') re-reads the corpus on every
    # membership test in the original comprehension; build a set once so
    # lookups are O(1) and the corpus loads a single time per call.
    stop_set = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if w not in stop_set]
    return " ".join(filtered_words)
# Work on an explicit copy so column assignments below can't warn about (or
# write through) a view of `df`.
df_small = df.copy()
df_small['cleaned_hm'] = [preprocess(x) for x in df_small['cleaned_hm']]
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer=WordNetLemmatizer()
# BUG FIX: the original called lemmatizer.lemmatize() on each *whole
# sentence* string, which is effectively a no-op for multi-word text.
# Lemmatize token-by-token and rejoin instead.
df_small['cleaned_hm'] = [
    ' '.join(lemmatizer.lemmatize(tok) for tok in word_tokenize(sent))
    for sent in df_small['cleaned_hm']
]
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

# Lemmatize every document token-by-token (wordpunct tokenization) and
# re-join each one with single spaces, collecting the results in order.
lemma = nltk.stem.WordNetLemmatizer()
lemma_books = [
    ' '.join(lemma.lemmatize(token) for token in wordpunct_tokenize(document))
    for document in df_small['cleaned_hm']
]
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix over the lemmatized documents.
vectorizer = TfidfVectorizer()
dtm = vectorizer.fit_transform(lemma_books).toarray()
vocab = np.array(vectorizer.get_feature_names())
dtm.shape
# Map each vocabulary term to its IDF weight.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
# BUG FIX: the original built an empty DataFrame(columns=['tfidf']) and then
# called .from_dict(...) on it — from_dict is a classmethod, so the empty
# frame (and its columns argument) was silently discarded. Call it on the
# class directly; `tfidf` is already a dict, no dict(...) re-wrap needed.
tfidf = pd.DataFrame.from_dict(tfidf, orient='index')
tfidf.columns = ['tfidf']
# Most common terms (lowest IDF) and rarest terms (highest IDF).
tfidf.sort_values(by=['tfidf'], ascending=True).head(10)
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF space to a few latent components before t-SNE —
# t-SNE is very expensive in high dimensions.
n_comp=7
vz_sample = vectorizer.fit_transform(list(lemma_books))
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
from sklearn.manifold import TSNE
# 2-D embedding for plotting. NOTE(review): n_iter=250 is scikit-learn's
# minimum — presumably chosen for speed; the embedding may be
# under-converged. Confirm before drawing conclusions from the plot.
tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=250)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file
# Render Bokeh output inline in the notebook.
output_notebook()
# NOTE(review): the 'previewsave' tool was removed in later Bokeh releases —
# confirm against the installed version.
plot_tfidf = bp.figure(plot_width=700, plot_height=600,
                       title="tf-idf clustering of the item description",
                       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                       x_axis_type=None, y_axis_type=None, min_border=1)
# 2-D t-SNE coordinates per document, with the text attached for hover.
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = lemma_books
plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
# Show the document text when hovering a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"description": "@description"}
show(plot_tfidf)
from sklearn.cluster import MiniBatchKMeans
# Cluster the TF-IDF vectors with mini-batch k-means (fast on large corpora).
num_clusters = 13 # need to be selected wisely
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                               init='k-means++',
                               n_init=1,
                               init_size=1000, batch_size=1000, verbose=0, max_iter=250)
kmeans = kmeans_model.fit(vz_sample)
# Cluster assignment per document, and distance to every centroid.
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)
# reduce dimension to 2 using tsne
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
#combined_sample.reset_index(drop=True, inplace=True)
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['description'] = tfidf_df['description']
#kmeans_df['cluster']=kmeans_df.cluster.astype(str).astype('category')
plot_kmeans = bp.figure(plot_width=700, plot_height=600,
                        title="KMeans clustering of the description",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)
# Fixed colour per cluster id. The map holds 15 entries (0..14) even though
# only num_clusters (13) are requested — the extras are harmless headroom.
label_color_map = {0:'lightgrey',
                   1:'lightcoral',
                   2:'sandybrown',
                   3:'papayawhip',
                   4:'lemonchiffon',
                   5:'darkkhaki',
                   6:'yellow',
                   7:'greenyellow',
                   8:'lightgreen',
                   9:'aquamarine',
                   10:'darkkhaki',
                   11:'deepskyblue',
                   12:'dodgerblue',
                   13:'navy',
                   14:'blueviolet'}
# NOTE(review): colours come from kmeans_model.labels_ (assignments from
# fit) while kmeans_df['cluster'] comes from predict() on the same matrix —
# these should coincide for identical input, but confirm they stay in sync.
label_color = [label_color_map[l] for l in kmeans_model.labels_]
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from bokeh.models import HoverTool, DatetimeTickFormatter,ColumnDataSource
# Bundle coordinates, colours, text and cluster ids for the Bokeh scatter.
source = ColumnDataSource(data=dict(x=kmeans_df['x'], y=kmeans_df['y'],
                                    #color=colormap[kmeans_clusters],
                                    color=label_color,
                                    description=kmeans_df['description'],
                                    cluster=kmeans_df['cluster']))
plot_kmeans.scatter(x='x', y='y', color='color', source=source)
# Show the document text when hovering a point.
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"description": "@description"}
show(plot_kmeans)
# Top-10 vocabulary terms per cluster: indices of the ten largest centroid
# weights, highest first.
common_words = kmeans_model.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(vocab[word] for word in centroid))
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
no_features = 1000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(lemma_books)
tf_feature_names = tf_vectorizer.get_feature_names()
no_topics = 9
# NOTE(review): the `n_topics` keyword was renamed `n_components` in
# scikit-learn 0.19 and removed in 0.21, and get_feature_names() is likewise
# gone in modern releases — this cell only runs on an old sklearn; confirm
# the installed version before upgrading either name.
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its `no_top_words` highest-weight terms.

    `model` is any fitted decomposition with a `components_` array
    (topics x features); `feature_names` maps column index -> term.
    """
    for topic_index, weights in enumerate(model.components_):
        print(topic_index)
        # Indices of the strongest features, strongest first.
        ranked = weights.argsort()[::-1][:no_top_words]
        print([feature_names[term] for term in ranked])
# Print the top-10 terms for each of the 9 LDA topics.
no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)
import pyLDAvis
import pyLDAvis.sklearn
# Interactive topic-model visualization rendered inline in the notebook.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from subprocess import check_output

df_small_a = df_small
cleaned_hm_a = [str(i) for i in df_small_a['cleaned_hm']]
# BUG FIX: the original passed str(df_small_a['cleaned_hm']) to re.sub —
# that is the *truncated repr* of the Series (a handful of rows plus
# "Name: ... dtype: object"), so the cloud was built from almost none of
# the data. Join the full document list (cleaned_hm_a, previously unused).
letters_only = re.sub("[^a-zA-Z]",   # Search for all non-letters
                      " ",           # Replace all non-letters with spaces
                      " ".join(cleaned_hm_a))
wordcloud = WordCloud(width = 15000, height = 2000,
                      background_color ='grey', max_words=100,
                      min_font_size = 10).generate(letters_only)
print(wordcloud)
fig = plt.figure(1)
fig.set_size_inches(15.5, 7.5)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()